##########################################################################################################################################
###### This file creates matrixes that connect the firm PERMNOs and their NAICS industry classifications
###############################
# The dataframes saved here are later used in further preparation steps
##################################


########
# This is a datafile from Compustat containing the following elements:
# gvkey,datadate,fyear,indfmt,consol,popsrc,datafmt,tic,cusip,conm,curcd,sale,exchg,cik,costat,naicsh,naics,sic
Compustat = read.csv("0a4a9373fde91d3a.csv", header=T)

######
# this is the Link Table from the CRSP-Compustat Merged database
# it contains the elements: gvkey,LPERMNO,LINKDT,LINKENDDT
Link_Table = read.table("LinkTable.txt",header=T,sep=",")
###########
######
# this is a datafile from CRSP containing the following elements:
# PERMNO	date	SICCD	TICKER	COMNAM	NAICS	PERMCO	CUSIP	PRC
CRSP = read.table("ac2f35cda7948e58.txt",header=T,sep="\t")

library(stringi)
#First we can remove all companies without any sales data:

Compustat = Compustat[!is.na(Compustat$sale),]

Compustat$datadate = as.Date(as.character(Compustat$datadate),format="%Y%m%d")
#This is why he starts at 1972, before we barely have any data:
table(format(Compustat$datadate,"%Y"))
#Just match the time of Grullon et al. (2019)
Compustat = Compustat[Compustat$datadate>"1972-01-01",]

#Use NAICSH whenever available, create a three digit representation of the NAICS and a four digit one:

Compustat$NAICS_3 = as.numeric(unlist(lapply(strsplit(as.character(Compustat$naicsh),split=""),function(x) stri_paste(head(x,n=3),collapse=""))))
Compustat$NAICS_4 = as.numeric(unlist(lapply(strsplit(as.character(Compustat$naicsh),split=""),function(x) stri_paste(head(x,n=4),collapse=""))))
#If this is NA, then use CRSP NAICS

#The Link Table has multiple LPERMNOs with one gvkey:
#So also match them within the time-frame defined by LINKDT and LINKENDDT

Compustat$PERMNO = rep(NA,nrow(Compustat))
for(k in which(!duplicated(Compustat$gvkey))){
  if(length(Link_Table$LINKENDDT[Link_Table$gvkey %in% Compustat$gvkey[k]])!=0){
    
    for(j in 1:length(Link_Table$LINKENDDT[Link_Table$gvkey %in% Compustat$gvkey[k]])){
      Temp = Compustat[Compustat$gvkey == Compustat$gvkey[k],]
      Temp_Seq = c()
      if(is.na(as.numeric(format(as.Date(as.character(Link_Table$LINKENDDT[Link_Table$gvkey %in% Compustat$gvkey[k]][j]),format="%Y%m%d"),"%Y")))){
        Temp_Seq = seq(from = as.numeric(format(as.Date(as.character(Link_Table$LINKDT[Link_Table$gvkey %in% Compustat$gvkey[k]][j]),format="%Y%m%d"),"%Y")),
                       to = 2020)
        Temp_Seq = unique(Temp_Seq)
        Temp_Indik = Temp$fyear %in% Temp_Seq
        Temp = Temp[Temp$fyear %in% Temp_Seq,] 
        #
        
        Compustat[Compustat$gvkey == Compustat$gvkey[k],]$PERMNO[Temp_Indik] = rep(Link_Table$LPERMNO[Link_Table$gvkey %in% Compustat$gvkey[k]][j],nrow(Temp))
        
      }else{
        Temp_Seq = seq(from = as.numeric(format(as.Date(as.character(Link_Table$LINKDT[Link_Table$gvkey %in% Compustat$gvkey[k]][j]),format="%Y%m%d"),"%Y")),
                       to = as.numeric(format(as.Date(as.character(Link_Table$LINKENDDT[Link_Table$gvkey %in% Compustat$gvkey[k]][j]),format="%Y%m%d"),"%Y")))
        
        Temp_Seq = unique(Temp_Seq)
        Temp_Indik = Temp$fyear %in% Temp_Seq
        Temp = Temp[Temp$fyear %in% Temp_Seq,] 
        #
        
        Compustat[Compustat$gvkey == Compustat$gvkey[k],]$PERMNO[Temp_Indik] = rep(Link_Table$LPERMNO[Link_Table$gvkey %in% Compustat$gvkey[k]][j],nrow(Temp))
      }
    }
  }
}


#Now do this, after we have matched the PERMNO to all data and conduct the analysis as beforehand:
NA_Subset = Compustat[is.na(Compustat$NAICS_3),]

#The NAICS of the CRSP data is not NA at some point in the future, depending on the year, so we must first 
#compile the CRSP data by company and add two columns telling us from when until when the NAICS was available
CRSP = CRSP[!is.na(CRSP$NAICS),]
CRSP$date = as.Date(as.character(CRSP$date),format="%Y%m%d") 
CRSP = CRSP[format(CRSP$date,"%m")=="12",]

#At Max we have 6 unique NAICS Codes per company!
CRSP_NAICS = data.frame(PERMNO = unique(CRSP$PERMNO), Start = rep(NA,length(unique(CRSP$PERMNO))), 
                        End = rep(NA,length(unique(CRSP$PERMNO))))

for(j in 1:nrow(CRSP_NAICS)){
  Temp = CRSP[CRSP$PERMNO==CRSP_NAICS$PERMNO[j],]
  CRSP_NAICS$Start[j] = format(Temp$date,"%Y")[1]
  CRSP_NAICS$End[j] = format(Temp$date,"%Y")[nrow(Temp)]

}

for(i in 1:nrow(CRSP_NAICS)){
  #If this PERMNO is as well included in the subset which is NA, so has no NAICSH
  if(sum(!is.na(NA_Subset[CRSP_NAICS$PERMNO[i]==NA_Subset$PERMNO,]))==0){
    
  }else{
    Temp = NA_Subset[!is.na(NA_Subset$PERMNO) & NA_Subset$PERMNO == CRSP_NAICS$PERMNO[i],]
    Interm = CRSP[CRSP$PERMNO==unique(Temp$PERMNO) & format(CRSP$date,"%Y") %in% Temp$fyear,]
    #If Interm is NULL, we have no CRSP observations that match the NA subset from Compustat in year and PERMNO
    if(nrow(Interm) == 0){}else{
      #Now I could simply fill in NAICS_3 and NAICS_4
      Temp$NAICS_3 = as.numeric(unlist(lapply(strsplit(as.character(Interm$NAICS),split=""),function(x) stri_paste(head(x,n=3),collapse=""))))
      Temp$NAICS_4 = as.numeric(unlist(lapply(strsplit(as.character(Interm$NAICS),split=""),function(x) stri_paste(head(x,n=4),collapse=""))))
      
      NA_Subset[!is.na(NA_Subset$PERMNO) & NA_Subset$PERMNO == CRSP_NAICS$PERMNO[i],] = Temp
    }
  }
}
#Seems still to be working correctly (...)
which(CRSP_NAICS$PERMNO%in%NA_Subset$PERMNO)

#Now use the Compustat NAISC (naics)
NA_Subset[is.na(NA_Subset$NAICS_3),]$NAICS_3 = as.numeric(unlist(lapply(strsplit(as.character(NA_Subset[is.na(NA_Subset$NAICS_3),]$naics),split=""),function(x) stri_paste(head(x,n=3),collapse=""))))
NA_Subset[is.na(NA_Subset$NAICS_4),]$NAICS_4 = as.numeric(unlist(lapply(strsplit(as.character(NA_Subset[is.na(NA_Subset$NAICS_4),]$naics),split=""),function(x) stri_paste(head(x,n=4),collapse=""))))

#Delete all 4 digit NAICS that are not 4 digits:
NA_Subset$NAICS_3[NA_Subset$NAICS_3<100] = NA
NA_Subset$NAICS_4[NA_Subset$NAICS_4<1000] = NA



NA_Subset[is.na(NA_Subset$NAICS_3),]
##############################
# These files contain the SIC to NAICS conversion
# the files contain the following information:
# SIC,	Part Indicator,	SIC Titles and Part Descriptions,	1997 NAICS,	1997 NAICS Titles and Part Indicators
#SIC 1987 to 1997 NAICS
SIC_NAICS97 <- read.table("1987_SIC_to_1997_NAICS.txt", header=T, sep="\t")
#1997 NAICS to 2002 NAICS
NAICS97_NAICS02 <- read.table("1997_NAICS_to_2002_NAICS.txt", header=T, sep="\t")
#2002 NAICS to 2007 NAICS
NAICS02_NAICS07 <- read.table("2002_to_2007_NAICS.txt", header=T, sep="\t")
#2007 NAICS to 2012 NAICS
NAICS07_NAICS12 <- read.table("2007_to_2012_NAICS.txt", header=T, sep="\t")
#2012 NAICS to 2017 NAICS
NAICS12_NAICS17 <- read.table("2012_to_2017_NAICS.txt", header=T, sep="\t")
###############


SIC_NAICS97 <- na.omit(SIC_NAICS97)
NAICS97_NAICS02 <- na.omit(NAICS97_NAICS02)
NAICS02_NAICS07 <- na.omit(NAICS02_NAICS07)

options(warn=2)

for(i in which(!duplicated(NA_Subset$sic))){
  Temp = NA_Subset[NA_Subset$sic == NA_Subset$sic[i] & is.na(NA_Subset$NAICS_3),]
  NAICS_Temp = c()
  if(length(SIC_NAICS97[SIC_NAICS97$SIC == NA_Subset$sic[i],]$X1997.NAICS)==0){}else{
    #Everything below 02 take this value:
    #can be more than one NAICS per SIC, pick the most NAICS
    if(length(SIC_NAICS97[SIC_NAICS97$SIC == NA_Subset$sic[i],]$X1997.NAICS)>1){
      library(pracma)
      NAICS_Temp = rep(SIC_NAICS97[SIC_NAICS97$SIC == NA_Subset$sic[i],]$X1997.NAICS[Mode(as.numeric(unlist(lapply(strsplit(as.character(SIC_NAICS97[SIC_NAICS97$SIC == NA_Subset$sic[i],]$X1997.NAICS),split=""),function(x) stri_paste(head(x,n=3),collapse=""))))) == 
                                                                                       as.numeric(unlist(lapply(strsplit(as.character(SIC_NAICS97[SIC_NAICS97$SIC == NA_Subset$sic[i],]$X1997.NAICS),split=""),function(x) stri_paste(head(x,n=3),collapse=""))))][1], nrow(Temp))
    }else if (length(SIC_NAICS97[SIC_NAICS97$SIC == NA_Subset$sic[i],]$X1997.NAICS) == 1){
      NAICS_Temp = rep(SIC_NAICS97[SIC_NAICS97$SIC == NA_Subset$sic[i],]$X1997.NAICS, nrow(Temp))
    }
    #Update all values above 02:
    NAICS_Temp[Temp$fyear>=2002] = rep(NAICS97_NAICS02[NAICS97_NAICS02$NAICS97 %in% NAICS_Temp,]$NAICS02[1], nrow(Temp))[Temp$fyear>=2002]
    #Update all values above 07:
    NAICS_Temp[Temp$fyear>=2007] = rep(NAICS02_NAICS07[NAICS02_NAICS07$X2002.NAICS.Code %in% NAICS_Temp,]$X2007.NAICS.Code[1], nrow(Temp))[Temp$fyear>=2007]
    #Update all values above 12:
    NAICS_Temp[Temp$fyear>=2012] = rep(NAICS07_NAICS12[NAICS07_NAICS12$X2007.NAICS.Code %in% NAICS_Temp,]$X2012.NAICS.Code[1], nrow(Temp))[Temp$fyear>=2012]
    #Update all values above 17:
    NAICS_Temp[Temp$fyear>=2017] = rep(NAICS12_NAICS17[NAICS12_NAICS17$X2012.NAICS.Code %in% NAICS_Temp,]$X2017.NAICS.Code[1], nrow(Temp))[Temp$fyear>=2017]
    
    #Save the temporary NAICS:
    Temp$NAICS_3 = as.numeric(unlist(lapply(strsplit(as.character(NAICS_Temp),split=""),function(x) stri_paste(head(x,n=3),collapse=""))))
    Temp$NAICS_4 = as.numeric(unlist(lapply(strsplit(as.character(NAICS_Temp),split=""),function(x) stri_paste(head(x,n=4),collapse=""))))
    
    NA_Subset[NA_Subset$sic == NA_Subset$sic[i] & is.na(NA_Subset$NAICS_3),] = Temp
  }
}

options(warn=1)

Compustat[is.na(Compustat$NAICS_3),] = NA_Subset

#Build a matrix on the LHS you have the month, on the top the PERMNO
Compustat = Compustat[!is.na(Compustat$NAICS_3),]
Compustat = Compustat[!is.na(Compustat$PERMNO),]

#Build this Matrix: 
load("Beta_M.RData")

unique(Compustat$PERMNO)

PERMNO_NAICS3 = matrix(NA,nrow=dim(Beta_M)[1],ncol=dim(Beta_M)[2])
PERMNO_NAICS3[,1] = Beta_M[,1]
PERMNO_NAICS3[1,] = Beta_M[1,]

library(zoo)

options(warn=2)
for(i in which(!duplicated(Compustat$PERMNO))){
  Temp = Compustat[Compustat$PERMNO == Compustat$PERMNO[i],]
  PERMNO_NAICS3[,which(Compustat$PERMNO[i] == PERMNO_NAICS3[1,])][
    as.numeric(unlist(lapply(strsplit(as.character(PERMNO_NAICS3[,1]),split=""),function(x) stri_paste(head(x,n=6),collapse=""))))%in%as.numeric(paste0(Temp$fyear,"12"))] = Temp$NAICS_3[!duplicated(Temp$fyear)]
  PERMNO_NAICS3[2:length(PERMNO_NAICS3[,which(Compustat$PERMNO[i] == PERMNO_NAICS3[1,])]),which(Compustat$PERMNO[i] == PERMNO_NAICS3[1,])] = na.approx(PERMNO_NAICS3[2:length(PERMNO_NAICS3[,which(Compustat$PERMNO[i] == PERMNO_NAICS3[1,])]),which(Compustat$PERMNO[i] == PERMNO_NAICS3[1,])],na.rm=F,method="constant")
#Take the last value and extrapolate over the next 11 months with the last value this vector has to offer
  Which_Last = which(!is.na(PERMNO_NAICS3[,which(Compustat$PERMNO[i] == PERMNO_NAICS3[1,])]))[length(which(!is.na(PERMNO_NAICS3[,which(Compustat$PERMNO[i] == PERMNO_NAICS3[1,])])))]
  if(length(Which_Last)!=0){
  if(Which_Last!=dim(PERMNO_NAICS3)[1]){
  Interm = PERMNO_NAICS3[,which(Compustat$PERMNO[i] == PERMNO_NAICS3[1,])]
  Interm[(Which_Last+1):(Which_Last+11)] = rep(PERMNO_NAICS3[,which(Compustat$PERMNO[i] == PERMNO_NAICS3[1,])][Which_Last],11)  
  PERMNO_NAICS3[,which(Compustat$PERMNO[i] == PERMNO_NAICS3[1,])] = Interm
  }
  }
}
options(warn=1)



PERMNO_NAICS4 = matrix(NA,nrow=dim(Beta_M)[1],ncol=dim(Beta_M)[2])
PERMNO_NAICS4[,1] = Beta_M[,1]
PERMNO_NAICS4[1,] = Beta_M[1,]

options(warn=2)
for(i in which(!duplicated(Compustat$PERMNO))){
  Temp = Compustat[Compustat$PERMNO == Compustat$PERMNO[i],]
  PERMNO_NAICS4[,which(Compustat$PERMNO[i] == PERMNO_NAICS4[1,])][
    as.numeric(unlist(lapply(strsplit(as.character(PERMNO_NAICS4[,1]),split=""),function(x) stri_paste(head(x,n=6),collapse=""))))%in%as.numeric(paste0(Temp$fyear,"12"))] = Temp$NAICS_4[!duplicated(Temp$fyear)]
  PERMNO_NAICS4[2:length(PERMNO_NAICS4[,which(Compustat$PERMNO[i] == PERMNO_NAICS4[1,])]),which(Compustat$PERMNO[i] == PERMNO_NAICS4[1,])] = na.approx(PERMNO_NAICS4[2:length(PERMNO_NAICS4[,which(Compustat$PERMNO[i] == PERMNO_NAICS4[1,])]),which(Compustat$PERMNO[i] == PERMNO_NAICS4[1,])],na.rm=F,method="constant")
  #Take the last value and extrapolate over the next 11 months with the last value this vector has to offer
  Which_Last = which(!is.na(PERMNO_NAICS4[,which(Compustat$PERMNO[i] == PERMNO_NAICS4[1,])]))[length(which(!is.na(PERMNO_NAICS4[,which(Compustat$PERMNO[i] == PERMNO_NAICS4[1,])])))]
  if(length(Which_Last)!=0){
    if(Which_Last!=dim(PERMNO_NAICS4)[1]){
      Interm = PERMNO_NAICS4[,which(Compustat$PERMNO[i] == PERMNO_NAICS4[1,])]
      Interm[(Which_Last+1):(Which_Last+11)] = rep(PERMNO_NAICS4[,which(Compustat$PERMNO[i] == PERMNO_NAICS4[1,])][Which_Last],11)  
      PERMNO_NAICS4[,which(Compustat$PERMNO[i] == PERMNO_NAICS4[1,])] = Interm
    }
  }
}
options(warn=1)


save(PERMNO_NAICS3,file="PERMNO_NAICS3.RData")

save(PERMNO_NAICS4,file="PERMNO_NAICS4.RData")


